####################################################################
# Lipps, Jana & Dominik Schraff. Estimating subnational preferences 
# across the European Union. Political Science Research and Methods.
####################################################################

# Session info
# R version 3.5.1 (2018-07-02)
# Platform: x86_64-w64-mingw32/x64 (64-bit)
# Running under: Windows >= 8 x64 (build 9200)

#Packages
#Version: ggplot2_3.1.0  
library(ggplot2)
#Version: lme4_1.1-17
library(lme4)
#Version: survey_3.33-2
library(survey)
#Version: glmnet_2.0-16
library(glmnet)
#Version: gridExtra_2.3
library(gridExtra)
#Version: sjPlot_2.6.2
library(sjPlot)
#Version: sjstats_0.17.3
library(sjstats)
#Version: stargazer_5.2.2
library(stargazer)
#Version: BayesTree_0.3-1.4
library(BayesTree)
#Version: plyr_1.8.4
library(plyr)
#Version: arm_1.10-1 
library(arm)

#set working directory to folder containig replication files
#setwd("...")

#Load data
load("contmari.RData") #Contingency table for classical post-stratification 
load("extsynth.RData") #Contingency table for synthetic post-stratification
load("std_eb833_prep_2703.RData") #Eurobarometer 83.3
load("realm_prep_2703.RData") #Reference values (means) from Flash EB
load("realse_prep_2703.RData") #Reference values (std. errors) from Flash EB

#RMSE (extra weight to large errors)
rmse <- function(error)
{
  sqrt(mean(error^2))
}

#MAE (equal weight to all errors)
mae <- function(error)
{
  mean(abs(error))
}

#############
#Analysis

##Disaggregation

#Compare flash_eb values to std_eb values
std_eb.w <- svydesign(ids = ~1, data = std_eb, weights = std_eb$w1)
summary(std_eb.w)

std_eb.w$variables$eu_trust <- as.numeric(as.character(std_eb.w$variables$eu_trust))

stdeb_m <- 0
stdeb_v <- 0

for (i in unique(std_eb.w$variables$nuts)){
  stdeb_m[i] <- as.numeric(svymean(~eu_trust, subset(std_eb.w, std_eb.w$variables$nuts==i), na.rm=T)[1])
}
stdeb_m <- data.frame(stdeb_m[2:187], row.names = unique(std_eb.w$variables$nuts)[1:186])

for (i in unique(std_eb.w$variables$nuts)){
  stdeb_v[i] <- as.numeric(svyvar(~eu_trust, subset(std_eb.w, std_eb.w$variables$nuts==i), na.rm=T)[1])
}
stdeb_v <- data.frame(stdeb_v[2:187], row.names = unique(std_eb.w$variables$nuts)[1:186])

stdeb_std <- sqrt(stdeb_v)

means <- merge(stdeb_m, realm, by = "row.names", all.x = T)
means <- merge(means, realse, by.x = "Row.names", by.y = "row.names", all.x = T)

names(means) <- c("nuts", "steb", "real", "realstd")
means$real.ci.u <- means$real + means$realstd*1.96
means$real.ci.l <- means$real - means$realstd*1.96

#############
#MRP Analysis

#MrsP
ps.tab <- extsynth

m1 <- glmer(eu_trust~(1|nuts)+(1|isocntry)+(1|lr.cat)+(1|region)+
              (1|ager2)+(1|marital_r1)+(1|gender)+
              stdgdp, 
              family=binomial(link="logit"), data=std_eb)
summary(m1)

ps.tab$m1_cellpred <- invlogit(fixef(m1)["(Intercept)"]
                                    +ranef(m1)$nuts[as.character(ps.tab$nuts),1]
                                    +ranef(m1)$region[as.character(ps.tab$region),1]
                                    +ranef(m1)$isocntry[as.character(ps.tab$isocntry),1]
                                    +ranef(m1)$ager2[as.character(ps.tab$ager2),1]
                                    +ranef(m1)$marital_r1[as.character(ps.tab$marital_r1),1]
                                    +ranef(m1)$gender[as.character(ps.tab$gender),1]
                                    +ranef(m1)$lr.cat[as.character(ps.tab$lr.cat),1]
                                    +fixef(m1)["stdgdp"]*as.numeric(as.character(ps.tab$stdgdp))
                                  )

#Calculate the weighted prediction          
ps.tab$m1_wpred <- ps.tab$Share.adj*ps.tab$m1_cellpred 

ps.tab$nuts <- as.character(ps.tab$nuts)

#Get the regional predictions
m1pred <- ddply(ps.tab, .(nuts), summarize, postm1=sum(m1_wpred, na.rm = T), 
                 mlm1=mean(m1_cellpred, na.rm = T))

m1.final <- merge(means, m1pred, by = "nuts")
head(m1.final)

m1.final$postm1[m1.final$postm1== 0] <- NA
m1.final <- na.omit(m1.final)

#Pearson's rho
cor(m1.final$real, m1.final[,c(2,7,8)])

#Errors
sapply(m1.final$real - m1.final[,c(2,7,8)], mae)
sapply(m1.final$real - m1.final[,c(2,7,8)], rmse)

#Calculate coverage
m1.final$cov.steb <- 0
m1.final$cov.steb[m1.final$steb>m1.final$real.ci.l & m1.final$steb<m1.final$real.ci.u] <- 1
prop.table(table(m1.final$cov.steb))

m1.final$cov.postm1 <- 0
m1.final$cov.postm1[m1.final$postm1>m1.final$real.ci.l & m1.final$postm1<m1.final$real.ci.u] <- 1
prop.table(table(m1.final$cov.postm1))

#Classical MrP
ps.tab <- cont.mari

m1 <- glmer(eu_trust~(1|nuts)+(1|isocntry)+(1|gender)+(1|ager2)+(1|marital_r1)+(1|region)+
              +stdgdp, 
            family=binomial(link="logit"), data=std_eb)
summary(m1)

ps.tab$m1_cellpred <- invlogit(fixef(m1)["(Intercept)"]
                               +ranef(m1)$nuts[as.character(ps.tab$nuts),1]
                               +ranef(m1)$region[as.character(ps.tab$region),1]
                               +ranef(m1)$isocntry[as.character(ps.tab$isocntry),1]
                               +ranef(m1)$gender[as.character(ps.tab$gender),1]
                               +ranef(m1)$ager2[as.character(ps.tab$ager2),1]
                               +ranef(m1)$marital_r1[as.character(ps.tab$marital_r1),1]
                               +fixef(m1)["stdgdp"]*as.numeric(as.character(ps.tab$stdgdp))
)

#Calculate the weighted prediction          
ps.tab$m1_wpred <- ps.tab$popshare*ps.tab$m1_cellpred 

ps.tab$nuts <- as.character(ps.tab$nuts)

#Get the regional predictions
m1pred <- ddply(ps.tab, .(nuts), summarize, postm1=sum(m1_wpred, na.rm = T), 
                mlm1=mean(m1_cellpred, na.rm = T))

results <- merge(m1.final, m1pred, by = "nuts")
names(results) <- c("nuts", "steb", "real", "realstd", "real.ci.u",
                    "real.ci.l", "post.sj", "ml.sj", "cov.steb",
                    "cov.post.sj", "post.cj", "ml.cj")
head(results)

mrp.fullres <- merge(means, m1pred, by = "nuts")

#Pearson's rho
cor(results$real, results[,c(2,7,8,11,12)])

#Errors
sapply(results$real - results[,c(2,7,8,11,12)], mae) 
      
sapply(results$real - results[,c(2,7,8,11,12)], rmse) 

#Coverage      
results$cov.post.cj <- 0
results$cov.post.cj[results$post.cj>results$real.ci.l & results$post.cj<results$real.ci.u] <- 1
prop.table(table(results$cov.post.cj))
prop.table(table(results$cov.post.sj))

##############
#BART analysis

###
#BART with classical joints and regional predictors
#Note: MCMC algorithm takes a while for calculation (approx. 20 minutes)
dat.train <- std_eb[,c("nuts", "isocntry", "ager2", "marital_r1", "gender", "eu_trust",
                       "gdp", "pop2", "area2", "edu2", "inc2")]
dat.train <- ddply(dat.train, .(isocntry), transform,
                   stdgdp.gr = gdp - mean(gdp, na.rm = T),
                   stdpopd1.gr = pop2 - mean(pop2, na.rm = T),
                   stdarea1.gr = area2 - mean(area2, na.rm = T),
                   stdedu1.gr = edu2 - mean(edu2, na.rm = T),
                   stdinc1.gr = inc2 - mean(inc2, na.rm = T),
                   gdp.cntry=mean(gdp, na.rm = T),
                   popd1.cntry=mean(pop2, na.rm = T),
                   area1.cntry=mean(area2, na.rm = T),
                   edu1.cntry=mean(edu2, na.rm = T),
                   inc1.cntry=mean(inc2, na.rm = T))
dat.train <- dat.train[,-c(2,7:11)]

dat.train <- na.omit(dat.train)
dat.train$nuts <- as.factor(dat.train$nuts)
dat.train$gender <- as.factor(dat.train$gender)
dat.train$ager2 <- as.factor(dat.train$ager2)
dat.train$marital_r1 <- as.factor(dat.train$marital_r1)
dat.train$eu_trust <- as.factor(dat.train$eu_trust)

dat.train.x <- makeind(dat.train[,-5])

dat.test <- expand.grid(nuts=unique(dat.train$nuts),
                        gender= c("Man", "Woman"),
                        ager2 = unique(std_eb$ager2),
                        marital_r1= unique(std_eb$marital_r1)[1:4])
dat.test <- merge(dat.test, unique(dat.train[,c(1,6:15)]), by = "nuts")
dat.test.x <- makeind(dat.test)

set.seed(99)
system.time(bart.result <- bart(x.train=dat.train.x,
                                y.train=dat.train[,5],
                                x.test=dat.test.x,
                                binaryOffset=0,
                                k=10,
                                ndpost=1000))

plot(bart.result)
bart.result$yhat.test.p <- pnorm(bart.result$yhat.test)
bart.meds <- colMeans(bart.result$yhat.test)
bart.meds.p <- colMeans(bart.result$yhat.test.p)

dat.test$preds <-  bart.meds.p

final.bart.p <- merge(dat.test, cont.mari[,c(1:7)], by=c("nuts", "gender", "ager2", "marital_r1"))

tot.share <- ddply(final.bart.p, .(nuts), summarise, totshare = sum(popshare))
final.bart.p <- merge(final.bart.p, tot.share, by = "nuts")

final.bart.p$popshare.adj <- final.bart.p$popshare/final.bart.p$totshare

bart.reg <- ddply(final.bart.p, .(nuts), summarize, bart.p.cj=weighted.mean(preds, popshare.adj),
                  bart.cj=mean(preds))

results <- merge(results, bart.reg, by="nuts")

#Pearson's rho
cor(results$real, results[,c(2,7,8,11,12,14,15)])

#Errors
sapply(results$real - results[,c(2,7,8,11,12,14,15)], mae) 

sapply(results$real - results[,c(2,7,8,11,12,14,15)], rmse) 

###
#BART with synthetic joints and regional predictors
#Note: MCMC algorithm takes a while for calculation (approx. 20 minutes)

dat.train <- std_eb[,c("nuts", "isocntry", "lr.cat", "ager2", "marital_r1", "gender", "eu_trust",
                       "gdp", "pop2", "area2", "edu2", "inc2")]
dat.train <- ddply(dat.train, .(isocntry), transform,
                   stdgdp.gr = gdp - mean(gdp, na.rm = T),
                   stdpopd1.gr = pop2 - mean(pop2, na.rm = T),
                   stdarea1.gr = area2 - mean(area2, na.rm = T),
                   stdedu1.gr = edu2 - mean(edu2, na.rm = T),
                   stdinc1.gr = inc2 - mean(inc2, na.rm = T),
                   gdp.cntry=mean(gdp, na.rm = T),
                   popd1.cntry=mean(pop2, na.rm = T),
                   area1.cntry=mean(area2, na.rm = T),
                   edu1.cntry=mean(edu2, na.rm = T),
                   inc1.cntry=mean(inc2, na.rm = T))
dat.train <- dat.train[,-c(2,8:12)]

dat.train <- na.omit(dat.train)
dat.train$nuts <- as.factor(dat.train$nuts)
dat.train$lr.cat <- as.factor(dat.train$lr.cat)
dat.train$ager2 <- as.factor(dat.train$ager2)
dat.train$marital_r1 <- as.factor(dat.train$marital_r1)
dat.train$gender <- as.factor(dat.train$gender)
dat.train$eu_trust <- as.factor(dat.train$eu_trust)
dat.train <- dat.train[order(dat.train$nuts, dat.train$marital_r1, dat.train$ager2),]

dat.train.x <- makeind(dat.train[,-6])

dat.test <- expand.grid(unique(dat.train$nuts),  
                        c("Left", "Right", "Centre"), levels(std_eb$ager2), 
                        c("Single", "(Re-)Married", "Widow", "Divorced or separated"),
                        c("Woman", "Man"))

colnames(dat.test) <- c("nuts", "lr.cat", "ager2", "marital_r1", "gender")
dat.test <- dat.test[order(dat.test$nuts, dat.test$marital_r1, dat.test$ager2),]

dat.test <- merge(dat.test, unique(dat.train[,c(1,7:16)]), by = "nuts")
dat.test.x <- makeind(dat.test)

set.seed(99)
system.time(bart.synth.ext <- bart(x.train=dat.train.x,
                               y.train=dat.train[,6],
                               x.test=dat.test.x,
                               binaryOffset=0,
                               ntree = 200,
                               k=10,
                               ndpost=1000))

plot(bart.synth.ext)
bart.synth.ext$yhat.test.p <- pnorm(bart.synth.ext$yhat.test)
bart.meds <- colMeans(bart.synth.ext$yhat.test)
bart.meds.p <- colMeans(bart.synth.ext$yhat.test.p)

dat.test$preds <-  bart.meds.p

final.bart.p <- merge(dat.test, extsynth[,c("nuts", "lr.cat", "ager2", "marital_r1", 
                                            "gender", "Share.adj")], 
                      by=c("nuts", "lr.cat", "ager2", "marital_r1", "gender"))

bart.reg <- ddply(final.bart.p, .(nuts), summarize, bart.p.sj=weighted.mean(preds, Share.adj),
                  bart.sj=mean(preds))

bart.reg <- na.omit(bart.reg)
results <- merge(results, bart.reg, by="nuts")

#Pearson's rho
cor(results$real, results[,c(2,7,8,11,12,14,15,16,17)])

#Errors
sapply(results$real - results[,c(2,7,8,11,12,14,15,16,17)], mae) 

sapply(results$real - results[,c(2,7,8,11,12,14,15,16,17)], rmse) 

###
#Compile results
mae.res <- data.frame(round(sapply(results$real - results[,c(2,7,8,11,12,14,15,16,17)], mae) 
      - sapply(results$real - results[,c(2,7,8,11,12,14,15,16,17)], mae)[1], 3))
names(mae.res) <- c("mae")

mae.result <- data.frame(row.names(mae.res)[2:9], mae.res[-c(1),])
names(mae.result) <- c("strategy", "mae")

rmse.res <- data.frame(round(sapply(results$real - results[,c(2,7,8,11,12,14,15,16,17)], rmse) 
      - sapply(results$real - results[,c(2,7,8,11,12,14,15,16,17)], rmse)[1], 3))
names(rmse.res) <- c("rmse")

rmse.result <- data.frame(row.names(rmse.res)[2:9], rmse.res[-c(1),])
names(rmse.result) <- c("strategy", "rmse")

#Coverage for ML models
prop.table(table(results$cov.post.cj))
prop.table(table(results$cov.post.sj))

results$cov.ml.cj<- 0
results$cov.ml.cj[results$ml.cj>results$real.ci.l & results$ml.cj<results$real.ci.u] <- 1
prop.table(table(results$cov.ml.cj))

results$cov.ml.sj<- 0
results$cov.ml.sj[results$ml.sj>results$real.ci.l & results$ml.sj<results$real.ci.u] <- 1
prop.table(table(results$cov.ml.sj))

#Coverage for BART
results$cov.bart.cj<- 0
results$cov.bart.cj[results$bart.cj>results$real.ci.l & results$bart.cj<results$real.ci.u] <- 1
prop.table(table(results$cov.bart.cj))

results$cov.bart.sj<- 0
results$cov.bart.sj[results$bart.sj>results$real.ci.l & results$bart.sj<results$real.ci.u] <- 1
prop.table(table(results$cov.bart.sj))

results$cov.bart.p.cj<- 0
results$cov.bart.p.cj[results$bart.p.cj>results$real.ci.l & results$bart.p.cj<results$real.ci.u] <- 1
prop.table(table(results$cov.bart.p.cj))

results$cov.bart.p.sj<- 0
results$cov.bart.p.sj[results$bart.p.sj>results$real.ci.l & results$bart.p.sj<results$real.ci.u] <- 1
prop.table(table(results$cov.bart.p.sj))

#Plotting the results against the representative means

#Flash and disaggregation
disag <- ggplot(results, aes(x=steb, y=real)) +
  geom_point(size = 1) +
  geom_smooth(method = "lm") +
  geom_abline(intercept = 0, slope = 1) +
  xlim(0,1) + ylim(0,1) +
  xlab("Predicted Mean") +
  ylab("Regional Representative Mean") +
  ggtitle("Disaggregation") +
  annotate("text", x=.15, y=0.99, label="r == 0.453", size=3, parse=T) +
  annotate("text", x=.15, y=0.94, label="cov == 0.332", size=3, parse=T) +
  annotate("text", x=.15, y=0.89, label="MAE == 0.118", size=3, parse=T) +
  annotate("text", x=.15, y=0.84, label="RMSE == 0.146", size=3, parse=T)

#Flash and post.cj
postcj <- ggplot(results, aes(x=post.cj, y=real)) +
  geom_point(size = 1) +
  geom_smooth(method = "lm") +
  geom_abline(intercept = 0, slope = 1) +
  xlim(0,1) + ylim(0,1) +
  xlab("Predicted Mean") +
  ylab("Regional Representative Mean") +
  ggtitle("Classical MrP") +
  annotate("text", x=.15, y=0.99, label="r == 0.557", size=3, parse=T) +
  annotate("text", x=.15, y=0.94, label="cov == 0.474", size=3, parse=T) +
  annotate("text", x=.15, y=0.89, label="MAE == 0.083", size=3, parse=T) +
  annotate("text", x=.15, y=0.84, label="RMSE == 0.103", size=3, parse=T)

#Flash and post.sj
postsj <- ggplot(results, aes(x=post.sj, y=real)) +
  geom_point(size = 1) +
  geom_smooth(method = "lm") +
  geom_abline(intercept = 0, slope = 1) +
  xlim(0,1) + ylim(0,1) +
  xlab("Predicted Mean") +
  ylab("Regional Representative Mean") +
  ggtitle("Synthetic MrP") +
  annotate("text", x=.15, y=0.99, label="r == 0.550", size=3, parse=T) +
  annotate("text", x=.15, y=0.94, label="cov == 0.436", size=3, parse=T) +
  annotate("text", x=.15, y=0.89, label="MAE == 0.092", size=3, parse=T) +
  annotate("text", x=.15, y=0.84, label="RMSE == 0.115", size=3, parse=T)

#Flash and ps.cj BART
bartpcj <- ggplot(results, aes(x=bart.p.cj, y=real)) +
  geom_point(size = 1) +
  geom_smooth(method = "lm") +
  geom_abline(intercept = 0, slope = 1) +
  xlim(0,1) + ylim(0,1) +
  ggtitle("BART (classical joints)") +
  ylab("Regional Representative Mean") +
  xlab("Predicted Mean") +
  annotate("text", x=.15, y=0.99, label="r == 0.555", size=3, parse=T) +
  annotate("text", x=.15, y=0.94, label="cov == 0.542", size=3, parse=T) +
  annotate("text", x=.15, y=0.89, label="MAE == 0.079", size=3, parse=T) +
  annotate("text", x=.15, y=0.84, label="RMSE == 0.098", size=3, parse=T)

#Flash and ps.sj BART
bartpsj <- ggplot(results, aes(x=bart.p.sj, y=real)) +
  geom_point(size = 1) +
  geom_smooth(method = "lm") +
  geom_abline(intercept = 0, slope = 1) +
  xlim(0,1) + ylim(0,1) +
  ggtitle("BART (synthetic joints)") +
  ylab("Regional Representative Mean") +
  xlab("Predicted Mean") +
  annotate("text", x=.15, y=0.99, label="r == 0.576", size=3, parse=T) +
  annotate("text", x=.15, y=0.94, label="cov == 0.574", size=3, parse=T) +
  annotate("text", x=.15, y=0.89, label="MAE == 0.067", size=3, parse=T) +
  annotate("text", x=.15, y=0.84, label="RMSE == 0.087", size=3, parse=T)

grid.arrange(disag, postcj, postsj, bartpcj, bartpsj, nrow=2)
dev.copy2pdf(width = 8, height = 7, file = "Figure_1.pdf")
dev.off()

#Plotting results of non-post-stratified estimations (Figure A1)
mlcj <- ggplot(results, aes(x=ml.cj, y=real)) +
  geom_point(size = 1) +
  geom_smooth(method = "lm") +
  geom_abline(intercept = 0, slope = 1) +
  xlim(0,1) + ylim(0,1) +
  ggtitle("Classical MrP") +
  xlab("Predicted Mean") +
  ylab("Regional Representative Mean") +
  annotate("text", x=.15, y=0.99, label="R^2 == 0.537", size=3, parse=T) +
  annotate("text", x=.15, y=0.94, label="cov == 0.471", size=3, parse=T) +
  annotate("text", x=.15, y=0.89, label="MAE == 0.088", size=3, parse=T) +
  annotate("text", x=.15, y=0.84, label="RMSE == 0.109", size=3, parse=T)

#Flash and ml.sj
mlsj <- ggplot(results, aes(x=ml.sj, y=real)) +
  geom_point(size = 1) +
  geom_smooth(method = "lm") +
  geom_abline(intercept = 0, slope = 1) +
  xlim(0,1) + ylim(0,1) +
  ggtitle("Synthetic MrP") +
  xlab("Predicted Mean") +
  ylab("Regional Representative Mean") +
  annotate("text", x=.15, y=0.99, label="R^2 == 0.545", size=3, parse=T) +
  annotate("text", x=.15, y=0.94, label="cov == 0.458", size=3, parse=T) +
  annotate("text", x=.15, y=0.89, label="MAE == 0.089", size=3, parse=T) +
  annotate("text", x=.15, y=0.84, label="RMSE == 0.112", size=3, parse=T)

#Flash and cj BART
bartcj <- ggplot(results, aes(x=bart.cj, y=real)) +
  geom_point(size = 1) +
  geom_smooth(method = "lm") +
  geom_abline(intercept = 0, slope = 1) +
  xlim(0,1) + ylim(0,1) +
  ggtitle("BART (classical joints)") +
  xlab("Predicted Mean") +
  ylab("Regional Representative Mean") +
  annotate("text", x=.15, y=0.99, label="R^2 == 0.554", size=3, parse=T) +
  annotate("text", x=.15, y=0.94, label="cov == 0.542", size=3, parse=T) +
  annotate("text", x=.15, y=0.89, label="MAE == 0.078", size=3, parse=T) +
  annotate("text", x=.15, y=0.84, label="RMSE == 0.098", size=3, parse=T)

#Flash and sj BART
bartsj <- ggplot(results, aes(x=bart.sj, y=real)) +
  geom_point(size = 1) +
  geom_smooth(method = "lm") +
  geom_abline(intercept = 0, slope = 1) +
  xlim(0,1) + ylim(0,1) +
  ggtitle("BART (synthetic joints)") +
  xlab("Predicted Mean") +
  ylab("Regional Representative Mean") +
  annotate("text", x=.15, y=0.99, label="R^2 == 0.580", size=3, parse=T) +
  annotate("text", x=.15, y=0.94, label="cov == 0.587", size=3, parse=T) +
  annotate("text", x=.15, y=0.89, label="MAE == 0.067", size=3, parse=T) +
  annotate("text", x=.15, y=0.84, label="RMSE == 0.086", size=3, parse=T)

grid.arrange(disag, mlcj, mlsj, bartcj, bartsj, nrow=2)
dev.copy2pdf(width = 8, height = 7, file = "Figure_A1.pdf")
dev.off()
